# Package setup ----
# NOTE(review): rm(list = ls()) was removed — clearing the global environment
# inside a script is discouraged (restart the R session instead), and it does
# not detach packages or reset options anyway.
library(naniar)
library(ggplot2)        # was loaded twice in the original (duplicate removed)
library(reshape2)
library(magrittr)
library(dplyr)
library(car)
library(GGally)
library(viridis)
library(caret)
library(cowplot)
library(FNN)
library(MASS)
library(rpart)
library(rpart.plot)
library(adabag)
library(randomForest)
library(imbalance)
library(e1071)
library(neuralnet)
library(nnet)
# setwd("~/GitHub/CVTDM_Project")

# Load the white-wine data: semicolon-separated, header row present.
wine <- read.csv(file = "winequality-white.csv", header = TRUE, sep = ";")
First, let’s have a look at the variables and their types:
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
Now, let’s create a binned_quality variable for the purpose of the problem:
sapply(wine, function(x) length(unique(x)))
## fixed.acidity volatile.acidity citric.acid
## 68 125 87
## residual.sugar chlorides free.sulfur.dioxide
## 310 160 132
## total.sulfur.dioxide density pH
## 251 890 103
## sulphates alcohol quality
## 79 103 7
# Bin the 0-10 quality score into three groups:
#   quality < 5       -> "Low"
#   5 <= quality < 7  -> "Intermediate"
#   quality >= 7      -> "High"
# The original had an unreachable "None" fallback (any score not < 5 and not
# < 7 is necessarily >= 7, i.e. "High") and a redundant `quality >= 5` test;
# both are dropped. as.factor() still yields the same alphabetical levels:
# High, Intermediate, Low — order matters for byClass indexing later.
wine$binned_quality <- as.factor(ifelse(wine$quality < 5, "Low",
                                        ifelse(wine$quality < 7, "Intermediate",
                                               "High")))
# From here on the raw score itself is treated as categorical
wine$quality <- as.factor(wine$quality)
We can also have a look at the summary statistics:
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
##
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 2.00 Min. : 9.0 Min. :0.9871
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917
## Median :0.04300 Median : 34.00 Median :134.0 Median :0.9937
## Mean :0.04577 Mean : 35.31 Mean :138.4 Mean :0.9940
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961
## Max. :0.34600 Max. :289.00 Max. :440.0 Max. :1.0390
##
## pH sulphates alcohol quality binned_quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 3: 20 High :1060
## 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50 4: 163 Intermediate:3655
## Median :3.180 Median :0.4700 Median :10.40 5:1457 Low : 183
## Mean :3.188 Mean :0.4898 Mean :10.51 6:2198
## 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40 7: 880
## Max. :3.820 Max. :1.0800 Max. :14.20 8: 175
## 9: 5
sapply(wine[,-c(12,13)], sd)
## fixed.acidity volatile.acidity citric.acid
## 0.843868228 0.100794548 0.121019804
## residual.sugar chlorides free.sulfur.dioxide
## 5.072057784 0.021847968 17.007137325
## total.sulfur.dioxide density pH
## 42.498064554 0.002990907 0.151000600
## sulphates alcohol
## 0.114125834 1.230620568
str(wine)
## 'data.frame': 4898 obs. of 13 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : Factor w/ 7 levels "3","4","5","6",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ binned_quality : Factor w/ 3 levels "High","Intermediate",..: 2 2 2 2 2 2 2 2 2 2 ...
Let’s look at the distribution of the variables based on the quality variable (not binned, on a 0-10 scale):
# Per-variable boxplots by raw quality score; melt() reshapes to long format
# and column 13 (binned_quality) is dropped before melting.
boxplots = ggplot(data = melt(wine[,-13], "quality"), aes(quality, value, group = quality)) +
geom_boxplot(fill = "transparent", color = "black") +
facet_wrap(~variable, scale = "free", ncol = 3) +
theme_classic()
boxplots
First, one can see that there is not a lot of variation between wines of different quality. There is a slight increase in alcohol quantity for wines of better quality, as well as a slight increase in pH levels.
Let’s continue by exploring the distribution of each variable:
# Histograms of the 11 numeric predictors, with mean (black, col = 1) and
# median (red, col = 2) marked as vertical lines.
par(mfrow = c(2, 3))
for (i in seq_len(11)) {
  hist(wine[, i], main = names(wine)[i], xlab = names(wine)[i])
  abline(v = mean(wine[, i]), col = 1, lwd = 2)
  abline(v = median(wine[, i]), col = 2, lwd = 2)
}
par(mfrow = c(1, 1))
# BUG FIX: names(wine$binned_quality) is NULL (a bare factor vector has no
# names), so the original barplot rendered with no title or x-axis label.
# Label the plot explicitly instead.
barplot(table(wine$binned_quality), main = "binned_quality", xlab = "binned_quality")
Our dataset faces two problems we have to deal with: imbalance between quality groups (‘Intermediate’ quality is over-represented) and skewness in the distribution of the explanatory variables.
To correct for the skewness, we can log-transform the variables of interest:
# Log-transform ALL predictors (for comparison with the raw distributions);
# columns 12-13 (quality, binned_quality) are left untouched.
alllogwine = wine
alllogwine[,-c(12,13)] = lapply(alllogwine[,-c(12,13)], log) #log transform all variables except quality and binned quality
# Same per-quality boxplots as before, now on the log scale
boxplots = ggplot(data = melt(alllogwine[,-13], "quality"), aes(quality, value, group = quality)) +
geom_boxplot(fill = "transparent", color = "black") +
facet_wrap(~variable, scale = "free", ncol = 3) +
theme_classic()
boxplots
Let’s have a look at the distribution of the variables after log-transformation:
# Histograms of the log-transformed predictors, again with the mean (black)
# and median (red) drawn as vertical reference lines.
par(mfrow = c(2, 3))
for (col_idx in seq_len(11)) {
  col_name <- names(alllogwine)[col_idx]
  values <- alllogwine[[col_idx]]
  hist(values, main = col_name, xlab = col_name)
  abline(v = mean(values), col = 1, lwd = 2)
  abline(v = median(values), col = 2, lwd = 2)
}
Clearly, we see an improvement in the distribution of the variables, with less variability due to extreme observations. There are still some variables for which the transformation does not add much: citric.acid, total.sulfur.dioxide and pH. Indeed, these variables were already pretty well distributed. Also, note that density is still right-skewed.
Now, let’s have a look at the correlation between the explanatory variables:
# Pearson correlations among the 11 predictors (quality columns excluded),
# rounded to 2 decimals and rendered as an annotated heatmap.
cor_mat = round(cor(wine[,-c(12,13)]),2)
cor_mat2 = melt(cor_mat)
ggplot(data = cor_mat2, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
geom_text(aes(Var2, Var1, label = value), color = "white", size = 3) +
labs(title = "Heatmap of the correlation table") +
theme(axis.text.x = element_text(angle=90))
We notice a high negative correlation coefficient between alcohol and density (-0.78). Because it is generally better to reduce the number of dimensions and because a high correlation might lead to multicollinearity issues, one can decide to drop the density variable.
Let’s look at the VIFs (Variance Inflation Factors) by performing a linear regression:
# Temporarily treat quality as numeric so lm()/vif() can run.
# NOTE(review): as.numeric() on the factor returns the level CODES 1-7, not
# the original scores 3-9; this is only a constant shift, so the VIFs of the
# predictors are unaffected.
wine$quality = as.numeric(wine$quality)
model = lm(quality ~., data = wine[,-13])   # column 13 = binned_quality
vif(model)
## fixed.acidity volatile.acidity citric.acid
## 2.691435 1.141156 1.165215
## residual.sugar chlorides free.sulfur.dioxide
## 12.644064 1.236822 1.787880
## total.sulfur.dioxide density pH
## 2.239233 28.232546 2.196362
## sulphates alcohol
## 1.138540 7.706957
It appears that the density variable has a very high VIF (28.2) compared to other variables. Hence, let’s have a look at the model without density and see if there is an improvement:
# Refit without density (column 8) and recheck the VIFs
model2 = lm(quality ~., data = wine[,-c(8,13)])
vif(model2)
## fixed.acidity volatile.acidity citric.acid
## 1.356128 1.128298 1.159884
## residual.sugar chlorides free.sulfur.dioxide
## 1.435215 1.203645 1.744627
## total.sulfur.dioxide pH sulphates
## 2.153170 1.330912 1.056637
## alcohol
## 1.647117
# Restore quality to a factor for the classification work below
wine$quality = as.factor(wine$quality)
Clearly, the VIF of alcohol is much lower than before (down from 7.7 to 1.6). Thus, we will drop density from our models.
Finally, let’s look at this plot:
ggparcoord(wine, columns = 1:11, groupColumn = 13, showPoints = TRUE, alphaLines = 0.3, scale = "uniminmax") + scale_color_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90))
First, let’s drop density and quality:
# Drop density (high VIF / collinearity with alcohol) and the raw quality
# score, keeping binned_quality as the outcome. Column NAMES are used instead
# of the positional indices c(8, 12) so a layout change cannot silently drop
# the wrong columns. Column order is preserved.
logwine <- wine[, !(names(wine) %in% c("density", "quality"))]
Now, let’s log transform all variables except citric.acid, total.sulfur.dioxide, pH and binned_quality:
# Log-transform every predictor except citric.acid, total.sulfur.dioxide and
# pH (already well distributed) and the outcome binned_quality. Column NAMES
# replace the positional indices c(3, 7, 8, 11) for robustness.
keep_raw <- c("citric.acid", "total.sulfur.dioxide", "pH", "binned_quality")
log_cols <- setdiff(names(logwine), keep_raw)
logwine[log_cols] <- lapply(logwine[log_cols], log)
head(logwine)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 1.945910 -1.309333 0.36 3.0301337 -3.101093
## 2 1.840550 -1.203973 0.34 0.4700036 -3.015935
## 3 2.091864 -1.272966 0.40 1.9315214 -2.995732
## 4 1.974081 -1.469676 0.32 2.1400662 -2.847312
## 5 1.974081 -1.469676 0.32 2.1400662 -2.847312
## 6 2.091864 -1.272966 0.40 1.9315214 -2.995732
## free.sulfur.dioxide total.sulfur.dioxide pH sulphates alcohol
## 1 3.806662 170 3.00 -0.7985077 2.174752
## 2 2.639057 132 3.30 -0.7133499 2.251292
## 3 3.401197 97 3.26 -0.8209806 2.312535
## 4 3.850148 186 3.19 -0.9162907 2.292535
## 5 3.850148 186 3.19 -0.9162907 2.292535
## 6 3.401197 97 3.26 -0.8209806 2.312535
## binned_quality
## 1 Intermediate
## 2 Intermediate
## 3 Intermediate
## 4 Intermediate
## 5 Intermediate
## 6 Intermediate
We can now proceed to the partitioning of the data, with a training set (50%), validation set (30%) and a test set (20%):
# Reproducible, class-stratified 50/30/20 split: 50% training, then 60% of
# the remaining half (= 30% overall) validation, and the rest (20%) test.
# createDataPartition() stratifies on binned_quality.
set.seed(1)
train_index = createDataPartition(logwine$binned_quality, p = .5, list = FALSE)
train_df = logwine[train_index,]
valid_test_df = logwine[-train_index,]
valid_index = createDataPartition(valid_test_df$binned_quality, p = .6, list = FALSE)
valid_df = valid_test_df[valid_index,]
test_df = valid_test_df[-valid_index,]
Because we use some classification techniques that require the data to be on a same scale (kNN, Neural Nets and Logistic Regression), we normalize the data on a [0,1] scale:
#initialize normalized training and validation data frames to the original ones
train_norm_df = train_df
valid_norm_df = valid_df
test_norm_df = test_df
#use PreProcess() from the caret package and predict() to normalize numerical variables
# Column 11 is binned_quality (the outcome) and is excluded from scaling.
# The [0,1] range parameters are estimated on the TRAINING set only and then
# applied to validation and test, avoiding data leakage.
norm_values = preProcess(train_df[,-c(11)], method = "range")
train_norm_df[,-c(11)] = predict(norm_values, train_df[,-c(11)])
valid_norm_df[,-c(11)] = predict(norm_values, valid_df[,-c(11)])
test_norm_df[,-c(11)] = predict(norm_values, test_df[,-c(11)])
Because we perform multi-label classification, and because the neuralnet package requires the outcome variable to be coded as (0, 1) vector, we create 3 dummies, one for every quality level (‘High’, ‘Intermediate’ and ‘Low’). Note that we also apply this to validation and training sets:
# One-hot encode the outcome for neuralnet: one 0/1 dummy per quality level
# (High / Intermediate / Low). A small helper replaces the nine
# near-identical ifelse() lines of the original; as.numeric(x == level)
# produces the same numeric 0/1 values ifelse(..., 1, 0) did.
add_quality_dummies <- function(df) {
  # Return df with high_quality / intermediate_quality / low_quality columns
  df$high_quality <- as.numeric(df$binned_quality == "High")
  df$intermediate_quality <- as.numeric(df$binned_quality == "Intermediate")
  df$low_quality <- as.numeric(df$binned_quality == "Low")
  df
}
train_norm_dummy_df <- add_quality_dummies(train_norm_df)
valid_norm_dummy_df <- add_quality_dummies(valid_norm_df)
test_norm_dummy_df <- add_quality_dummies(test_norm_df)
As stated earlier in the Data Exploratory Analysis, we face an imbalanced dataset:
table(logwine$binned_quality)
##
## High Intermediate Low
## 1060 3655 183
prop.table(table(logwine$binned_quality))
##
## High Intermediate Low
## 0.21641486 0.74622295 0.03736219
We see that ‘Intermediate’ quality represents 74.6% of observations, whereas ‘High’ and ‘Low’ quality only represent 21.6% and 3.7% of observations (respectively). Thus, the classification models could have problems in predicting ‘High’, and even more importantly ‘Low’ observations, as they only represent 3.7% of observations.
To solve this problem, we can oversample the ‘Low’ observations and add them into the training set. We use Random Walk Oversampling, as this method is known to be one of the most effective, especially when facing such an imbalanced dataset (Huaxiang Zhang, Mingfang Li, 2014).
Now, let’s create 1736 new instances for ‘Low’ and 1298 instances for ‘High’ quality (such that we have the same number of observations in each category within the training set) and let’s add them to the training set:
set.seed(1)
# Random Walk Oversampling (imbalance::rwo): synthesize minority-class rows
# and append them so the three classes are balanced in the training set.
add_low_train_df = rwo(train_df, 1736, "binned_quality") # Generation of 1736 instances for 'Low'
os_train_df = rbind(train_df, add_low_train_df) # Combining the new instances to the training set
set.seed(1)
# NOTE(review): this second rwo() call runs on os_train_df, which already
# contains the synthetic 'Low' rows — confirm rwo still targets the class
# intended here ('High') once 'Low' is no longer the minority.
add_high_train_df = rwo(os_train_df, 1298, "binned_quality") # Generation of 1298 instances for 'High'
os_train_df = rbind(os_train_df, add_high_train_df)
Let’s normalize the newly generated observations, and add them to the normalized training set:
os_train_norm_df = os_train_df
#use PreProcess() from the caret package and predict() to normalize numerical variables
# NOTE(review): these range parameters are re-estimated on the OVERSAMPLED
# training set, whereas valid_norm_df / test_norm_df were scaled with the
# original training-set parameters (norm_values). Distance-based models
# (kNN) therefore compare points scaled with two slightly different
# mappings — confirm this is intended, or reuse norm_values here instead.
os_norm_values = preProcess(os_train_df[,-c(11)], method = "range")
os_train_norm_df[,-c(11)] = predict(os_norm_values, os_train_df[,-c(11)])
We can also add these newly generated observations to the normalized training set with dummies for quality (used for Neural Nets):
# Initialize oversampled normalized training set with dummies to the oversampled normalized one
os_train_norm_dummy_df <- os_train_norm_df
# One 0/1 dummy per outcome level, as required by the neuralnet package
os_train_norm_dummy_df$high_quality <- ifelse(os_train_norm_dummy_df$binned_quality == "High", 1, 0)
os_train_norm_dummy_df$intermediate_quality <- ifelse(os_train_norm_dummy_df$binned_quality == "Intermediate", 1, 0)
os_train_norm_dummy_df$low_quality <- ifelse(os_train_norm_dummy_df$binned_quality == "Low", 1, 0)
In this part, we will proceed to the analysis using the following classification techniques: kNN, Ordinal Logistic Regression, Naive Bayes, Classification Trees (fitted to CP, Boosted Tree, Bagged Tree and Random Forest) and Neural Nets. Finally, we will combine the results to produce two ensemble methods based on Majority Voting and Mean Probability rules.
Note that we will use the training sets with oversampling on ‘Low’ quality observations for all methods.
Let’s proceed to the kNN:
#initialize a new data frame with three columns: k, kappa, and balanced_accuracy
best_k_df = data.frame(k = seq(1, 50, 1), kappa = rep(0, 50), balanced_accuracy = rep(0, 50))
#perform knn on the validation set using different k then store kappa and balanced accuracy for each k in the data frame
for (i in 1:50) {
  knn_pred = knn(train = os_train_norm_df[,-11], test = valid_norm_df[,-11], cl = os_train_norm_df[,11], k = i)
  # PERF FIX: build the confusion matrix ONCE per k — the original recomputed
  # confusionMatrix() four times per iteration with identical inputs.
  cm = confusionMatrix(knn_pred, valid_norm_df[,11])
  best_k_df[i, 2] = cm$overall[2]            # Kappa
  # byClass rows are ordered High (1), Intermediate (2), Low (3)
  low_sensitivity = cm$byClass[3, 1]
  intermediate_sensitivity = cm$byClass[2, 1]
  high_sensitivity = cm$byClass[1, 1]
  # Balanced accuracy = unweighted mean of the per-class sensitivities
  best_k_df[i, 3] = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
}
# Tuning curves: kappa (red) and balanced accuracy (blue) versus k, stacked
kappa_plot = ggplot(data= best_k_df) + geom_line(aes(x=k,y=kappa), color="red") + theme_classic()
balanced_accuray_plot = ggplot(data= best_k_df) + geom_line(aes(x=k,y=balanced_accuracy), color="blue") + theme_classic()
knn_plot = plot_grid(kappa_plot, balanced_accuray_plot, ncol = 1, align = "v")
knn_plot
# The two criteria disagree: kappa peaks at k = 1, balanced accuracy at k = 28
which.max(best_k_df$kappa)#best k based on kappa
## [1] 1
which.max(best_k_df$balanced_accuracy)#best k based on balanced accuracy
## [1] 28
From the plots above, the two criteria disagree: k = 1 maximizes the kappa value, while k = 28 maximizes the balanced accuracy. We proceed with k = 1, prioritizing kappa.
Choosing this parameter (k = 1), we can assess the predictive performance of the kNN model on the test set:
#perform knn classification on the test set using best k = 1
# NOTE(review): k = 1 maximizes kappa, but balanced accuracy peaked at
# k = 28 on the validation set — confirm k = 1 is the intended trade-off.
set.seed(1)
best_knn_pred = knn3Train(train = os_train_norm_df[,-11], test = test_norm_df[,-11], cl = os_train_norm_df[,11], k = 1, prob = T)
best_knn_cm <- confusionMatrix(as.factor(best_knn_pred), test_norm_df[,11])#create corresponding confusion matrix
best_knn_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 141 114 2
## Intermediate 51 519 16
## Low 20 97 18
##
## Overall Statistics
##
## Accuracy : 0.6933
## 95% CI : (0.6633, 0.722)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.3749
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.6651 0.7110 0.50000
## Specificity 0.8486 0.7298 0.87580
## Pos Pred Value 0.5486 0.8857 0.13333
## Neg Pred Value 0.9015 0.4617 0.97865
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1442 0.5307 0.01840
## Detection Prevalence 0.2628 0.5992 0.13804
## Balanced Accuracy 0.7568 0.7204 0.68790
# Kappa and balanced accuracy of the k = 1 kNN on the test set
kappa_best_knn = best_knn_cm$overall[2]
kappa_best_knn
## Kappa
## 0.3748935
# Balanced accuracy = unweighted mean of the per-class sensitivities
# (byClass rows are ordered High, Intermediate, Low)
low_sensitivity = best_knn_cm$byClass[3,1]
intermediate_sensitivity = best_knn_cm$byClass[2,1]
high_sensitivity = best_knn_cm$byClass[1,1]
bal_acc_best_knn = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_best_knn
## [1] 0.6253511
# Multinomial logistic regression (nnet::multinom) on the oversampled,
# unscaled training set.
# NOTE(review): the surrounding text calls this "Ordinal Logistic
# Regression", but multinom() fits a NOMINAL (unordered) model —
# MASS::polr would be the ordinal one. Confirm which was intended.
os_log_reg = multinom(binned_quality ~ ., data = os_train_df)
## # weights: 36 (22 variable)
## initial value 6024.789791
## iter 10 value 4983.359463
## iter 20 value 4393.575474
## iter 30 value 4287.516936
## iter 30 value 4287.516926
## iter 30 value 4287.516926
## final value 4287.516926
## converged
summary(os_log_reg)
## Call:
## multinom(formula = binned_quality ~ ., data = os_train_df)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity citric.acid
## Intermediate 31.86889 -0.962256 1.332421 1.997239
## Low 41.98841 1.637545 3.410483 2.390439
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Intermediate -0.2389481 1.275859 -0.4741764 0.0001637821
## Low -0.4902087 1.084144 -2.0908903 0.0007713492
## pH sulphates alcohol
## Intermediate -0.9102455 -0.49837427 -8.518968
## Low 0.3831240 0.04088377 -13.564077
##
## Std. Errors:
## (Intercept) fixed.acidity volatile.acidity citric.acid
## Intermediate 0.03791899 0.2653690 0.1202564 0.3810659
## Low 0.05520153 0.3293996 0.1469022 0.4245366
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Intermediate 0.04616474 0.1677127 0.07507555 0.0002866020
## Low 0.05390061 0.1941242 0.08257761 0.0002893369
## pH sulphates alcohol
## Intermediate 0.1995491 0.1552705 0.3660252
## Low 0.2476004 0.1893426 0.4563050
##
## Residual Deviance: 8575.034
## AIC: 8619.034
# Class probabilities on the test set; the predicted class is the column
# with the highest probability in each row.
os_log_prob = predict(os_log_reg, newdata = test_df, type = "p")
os_log_pred = data.frame("pred" = colnames(os_log_prob)[apply(os_log_prob,1,which.max)])
os_log_pred$pred = as.factor(os_log_pred$pred)
# Column 11 of test_df is binned_quality (the reference labels)
os_log_cm <- confusionMatrix(os_log_pred$pred, test_df[,11])
os_log_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 158 217 2
## Intermediate 41 363 8
## Low 13 150 26
##
## Overall Statistics
##
## Accuracy : 0.5593
## 95% CI : (0.5275, 0.5907)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2592
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.7453 0.4973 0.72222
## Specificity 0.7141 0.8024 0.82696
## Pos Pred Value 0.4191 0.8811 0.13757
## Neg Pred Value 0.9101 0.3516 0.98733
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1616 0.3712 0.02658
## Detection Prevalence 0.3855 0.4213 0.19325
## Balanced Accuracy 0.7297 0.6498 0.77459
# Kappa and balanced accuracy for the multinomial logistic model
kappa_logist = os_log_cm$overall[2]
kappa_logist
## Kappa
## 0.2591899
# Balanced accuracy = mean of the three per-class sensitivities
low_sensitivity = os_log_cm$byClass[3,1]
intermediate_sensitivity = os_log_cm$byClass[2,1]
high_sensitivity = os_log_cm$byClass[1,1]
bal_acc_logist = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_logist
## [1] 0.6549218
# Naive Bayes on the oversampled training set. With the classes balanced by
# oversampling, the a-priori probabilities come out at 1/3 each (see output).
os_nb_model = naiveBayes (binned_quality ~ ., data = os_train_df)
os_nb_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## High Intermediate Low
## 0.3333333 0.3333333 0.3333333
##
## Conditional probabilities:
## fixed.acidity
## Y [,1] [,2]
## High 1.901409 0.1148070
## Intermediate 1.918476 0.1220817
## Low 1.949177 0.1528686
##
## volatile.acidity
## Y [,1] [,2]
## High -1.390723 0.3460800
## Intermediate -1.339210 0.3204440
## Low -1.061166 0.3876045
##
## citric.acid
## Y [,1] [,2]
## High 0.3245435 0.0778726
## Intermediate 0.3371007 0.1261949
## Low 0.3079582 0.1617614
##
## residual.sugar
## Y [,1] [,2]
## High 1.337543 0.8285368
## Intermediate 1.549765 0.9327049
## Low 1.106317 0.9428895
##
## chlorides
## Y [,1] [,2]
## High -3.300988 0.2666303
## Intermediate -3.108617 0.3105590
## Low -3.075185 0.3467195
##
## free.sulfur.dioxide
## Y [,1] [,2]
## High 3.461398 0.4195593
## Intermediate 3.465685 0.5300177
## Low 2.780996 0.7982464
##
## total.sulfur.dioxide
## Y [,1] [,2]
## High 126.0414 50.7710
## Intermediate 142.7336 43.1156
## Low 133.7434 296.3143
##
## pH
## Y [,1] [,2]
## High 3.211628 0.1553305
## Intermediate 3.183485 0.1485863
## Low 3.191302 0.1664048
##
## sulphates
## Y [,1] [,2]
## High -0.7166881 0.2587124
## Intermediate -0.7382040 0.2114988
## Low -0.7587270 0.2463432
##
## alcohol
## Y [,1] [,2]
## High 2.426630 0.11687083
## Intermediate 2.325167 0.10633337
## Low 2.322142 0.09880537
# Hard class predictions and raw posterior probabilities on the test set
os_nb_pred = predict(os_nb_model, newdata = test_df, type = "class")
os_nb_prob = predict(os_nb_model, newdata = test_df, type = "raw")
bayes_cm <- confusionMatrix(os_nb_pred, test_df[,11])
bayes_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 163 231 4
## Intermediate 46 433 13
## Low 3 66 19
##
## Overall Statistics
##
## Accuracy : 0.6288
## 95% CI : (0.5977, 0.6592)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3036
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.7689 0.5932 0.52778
## Specificity 0.6932 0.7621 0.92675
## Pos Pred Value 0.4095 0.8801 0.21591
## Neg Pred Value 0.9155 0.3889 0.98090
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1667 0.4427 0.01943
## Detection Prevalence 0.4070 0.5031 0.08998
## Balanced Accuracy 0.7310 0.6776 0.72726
# Kappa and balanced accuracy for Naive Bayes
kappa_bayes = bayes_cm$overall[2]
kappa_bayes
## Kappa
## 0.3035937
# Balanced accuracy = mean of the three per-class sensitivities
low_sensitivity = bayes_cm$byClass[3,1]
intermediate_sensitivity = bayes_cm$byClass[2,1]
high_sensitivity = bayes_cm$byClass[1,1]
bal_acc_bayes = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_bayes
## [1] 0.6299321
Let’s first create a fully grown tree before pruning it to the best CP (i.e., CP associated with the minimum standard error):
# Grow a fully unpruned tree (cp = 0) with 5-fold cross-validation;
# printcp() reports the cross-validated error (xerror) for every candidate cp.
set.seed(1)
tree1 <- rpart(binned_quality ~ ., data=os_train_df, method = "class", control = rpart.control(cp = 0, minbucket = 2, xval = 5))
# Look at the minimum standard error
printcp(tree1)
##
## Classification tree:
## rpart(formula = binned_quality ~ ., data = os_train_df, method = "class",
## control = rpart.control(cp = 0, minbucket = 2, xval = 5))
##
## Variables actually used in tree construction:
## [1] alcohol chlorides citric.acid
## [4] fixed.acidity free.sulfur.dioxide pH
## [7] residual.sugar sulphates total.sulfur.dioxide
## [10] volatile.acidity
##
## Root node error: 3656/5484 = 0.66667
##
## n= 5484
##
## CP nsplit rel error xerror xstd
## 1 0.18052516 0 1.000000 1.01614 0.0094687
## 2 0.02871991 3 0.458425 0.46827 0.0093860
## 3 0.01413202 4 0.429705 0.43682 0.0092025
## 4 0.00847921 7 0.387309 0.40673 0.0090047
## 5 0.00492341 8 0.378829 0.39278 0.0089052
## 6 0.00437637 10 0.368982 0.38074 0.0088152
## 7 0.00410284 11 0.364606 0.38020 0.0088110
## 8 0.00382932 13 0.356400 0.37062 0.0087365
## 9 0.00369256 15 0.348742 0.36761 0.0087125
## 10 0.00341904 17 0.341357 0.36296 0.0086750
## 11 0.00307713 19 0.334519 0.35175 0.0085820
## 12 0.00273523 23 0.322210 0.34655 0.0085376
## 13 0.00246171 26 0.314004 0.34272 0.0085044
## 14 0.00218818 28 0.309081 0.33835 0.0084659
## 15 0.00209701 35 0.293490 0.33835 0.0084659
## 16 0.00191466 39 0.285011 0.32221 0.0083187
## 17 0.00164114 49 0.265591 0.31865 0.0082851
## 18 0.00150438 60 0.247538 0.30525 0.0081549
## 19 0.00136761 65 0.238239 0.30060 0.0081083
## 20 0.00127644 83 0.213348 0.29458 0.0080468
## 21 0.00109409 86 0.209519 0.28446 0.0079405
## 22 0.00100292 103 0.189278 0.28474 0.0079434
## 23 0.00095733 108 0.183260 0.26422 0.0077163
## 24 0.00094213 110 0.181346 0.26422 0.0077163
## 25 0.00091174 122 0.169858 0.26422 0.0077163
## 26 0.00082057 132 0.160011 0.26395 0.0077131
## 27 0.00072939 164 0.133479 0.26094 0.0076784
## 28 0.00070334 170 0.129103 0.26012 0.0076688
## 29 0.00068381 177 0.124179 0.26012 0.0076688
## 30 0.00063822 190 0.115153 0.24015 0.0074277
## 31 0.00061543 194 0.112418 0.24125 0.0074414
## 32 0.00054705 202 0.107495 0.24043 0.0074311
## 33 0.00043764 255 0.078501 0.23359 0.0073445
## 34 0.00041028 263 0.074398 0.23468 0.0073585
## 35 0.00036470 302 0.056346 0.23441 0.0073550
## 36 0.00032823 305 0.055252 0.23441 0.0073550
## 37 0.00027352 316 0.051422 0.23085 0.0073093
## 38 0.00018235 350 0.042123 0.22839 0.0072773
## 39 0.00013676 353 0.041575 0.22921 0.0072880
## 40 0.00000000 357 0.041028 0.23085 0.0073093
which.min(tree1$cptable[, 4])
## 38
## 38
# Take the cp associated with the minimum standard error to prune the tree:
# (the "xerror" column of the cp table is the cross-validated error;
# column 4 of the printed table above is the same quantity)
set.seed(1)
pruned_tree1 <- prune(tree1, cp = tree1$cptable[which.min(tree1$cptable[, "xerror"]), "CP"])
# If we want to look at the most important variables:
pruned_tree1$variable.importance
## total.sulfur.dioxide alcohol free.sulfur.dioxide
## 1294.0463 697.9393 648.7481
## chlorides volatile.acidity residual.sugar
## 536.7401 512.5792 483.5053
## pH sulphates citric.acid
## 437.1729 431.2653 424.1669
## fixed.acidity
## 377.5757
# We can now predict the outcome:
# (test_df holds the un-normalized log features; trees split on thresholds,
# so the [0,1] rescaling used for kNN/NN is unnecessary here)
pruned_tree1_pred_test <- predict(pruned_tree1, test_df, type="class")
pruned_tree1_prob_test <- predict(pruned_tree1, test_df, type="prob")
pruned_tree1_cm <- confusionMatrix(pruned_tree1_pred_test, test_df$binned_quality)
pruned_tree1_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 124 114 0
## Intermediate 83 568 24
## Low 5 48 12
##
## Overall Statistics
##
## Accuracy : 0.7198
## 95% CI : (0.6905, 0.7478)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 0.973312
##
## Kappa : 0.3479
##
## Mcnemar's Test P-Value : 0.000466
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.5849 0.7781 0.33333
## Specificity 0.8512 0.5685 0.94374
## Pos Pred Value 0.5210 0.8415 0.18462
## Neg Pred Value 0.8811 0.4653 0.97371
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1268 0.5808 0.01227
## Detection Prevalence 0.2434 0.6902 0.06646
## Balanced Accuracy 0.7180 0.6733 0.63854
# Kappa and balanced accuracy for the pruned tree
kappa_pruned_tree1 = pruned_tree1_cm$overall[2]
kappa_pruned_tree1
## Kappa
## 0.3479016
# Balanced accuracy = mean of the three per-class sensitivities
low_sensitivity = pruned_tree1_cm$byClass[3,1]
intermediate_sensitivity = pruned_tree1_cm$byClass[2,1]
high_sensitivity = pruned_tree1_cm$byClass[1,1]
bal_acc_pruned_tree1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_pruned_tree1
## [1] 0.5654404
Let’s proceed to the Boosted Tree:
set.seed(1)
# AdaBoost.M1 via adabag::boosting (default mfinal); the rpart.control is
# forwarded to the underlying trees.
boost1 <- boosting(binned_quality ~ ., data=os_train_df, control = rpart.control(xval = 5))
boost1_pred_test <- predict(boost1, test_df, type="class")
boost1_prob_test <- predict(boost1, test_df, type="prob")
# predict.boosting returns a list; $class holds the predicted labels
boost1_cm <- confusionMatrix(as.factor(boost1_pred_test$class), test_df$binned_quality)
boost1_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 158 195 3
## Intermediate 51 495 18
## Low 3 40 15
##
## Overall Statistics
##
## Accuracy : 0.683
## 95% CI : (0.6528, 0.7121)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3511
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.7453 0.6781 0.41667
## Specificity 0.7415 0.7218 0.95435
## Pos Pred Value 0.4438 0.8777 0.25862
## Neg Pred Value 0.9132 0.4324 0.97717
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1616 0.5061 0.01534
## Detection Prevalence 0.3640 0.5767 0.05930
## Balanced Accuracy 0.7434 0.6999 0.68551
# Kappa and balanced accuracy for the boosted tree
kappa_boost1 = boost1_cm$overall[2]
kappa_boost1
## Kappa
## 0.3510758
# Balanced accuracy = mean of the three per-class sensitivities
low_sensitivity = boost1_cm$byClass[3,1]
intermediate_sensitivity = boost1_cm$byClass[2,1]
high_sensitivity = boost1_cm$byClass[1,1]
bal_acc_boost1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_boost1
## [1] 0.613344
Let’s proceed to the Bagged Tree:
set.seed(1)
# Bagged trees via adabag::bagging (default mfinal); rpart.control is
# forwarded to the underlying trees.
bag1 <- bagging(binned_quality ~ ., data=os_train_df, control = rpart.control(xval = 5))
bag1_pred_test <- predict(bag1, test_df, type="class")
bag1_prob_test <- predict(bag1, test_df, type="prob")
# predict.bagging also returns a list; $class holds the predicted labels
bag1_cm <- confusionMatrix(as.factor(bag1_pred_test$class), test_df$binned_quality)
bag1_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 170 228 5
## Intermediate 40 456 18
## Low 2 46 13
##
## Overall Statistics
##
## Accuracy : 0.6534
## 95% CI : (0.6226, 0.6832)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3284
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.8019 0.6247 0.36111
## Specificity 0.6958 0.7661 0.94904
## Pos Pred Value 0.4218 0.8872 0.21311
## Neg Pred Value 0.9270 0.4095 0.97492
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1738 0.4663 0.01329
## Detection Prevalence 0.4121 0.5256 0.06237
## Balanced Accuracy 0.7489 0.6954 0.65508
# Kappa and balanced accuracy for the bagged tree
kappa_bag1 = bag1_cm$overall[2]
kappa_bag1
## Kappa
## 0.328362
# Balanced accuracy = mean of the three per-class sensitivities
low_sensitivity = bag1_cm$byClass[3,1]
intermediate_sensitivity = bag1_cm$byClass[2,1]
high_sensitivity = bag1_cm$byClass[1,1]
bal_acc_bag1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_bag1
## [1] 0.5958851
Let’s proceed to the Random Forest:
set.seed(1)
# Random forest with default ntree/mtry; column 11 (binned_quality) is the
# response, the remaining 10 columns are the predictors.
# NOTE(review): randomForest() has no 'control' argument — the
# rpart.control(xval = 5) here is absorbed by '...' and appears to have no
# effect. Consider removing it (or tuning ntree/mtry instead); confirm.
rf1 <- randomForest(os_train_df[-11], os_train_df$binned_quality, control = rpart.control(xval = 5))
rf1_pred_test <- predict(rf1, test_df, type="class")
rf1_prob_test <- predict(rf1, test_df, type="prob")
rf1_cm <- confusionMatrix(rf1_pred_test, test_df$binned_quality)
rf1_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 138 79 0
## Intermediate 73 639 26
## Low 1 12 10
##
## Overall Statistics
##
## Accuracy : 0.8047
## 95% CI : (0.7784, 0.8291)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 1.001e-05
##
## Kappa : 0.4964
##
## Mcnemar's Test P-Value : 0.09391
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.6509 0.8753 0.27778
## Specificity 0.8969 0.6008 0.98620
## Pos Pred Value 0.6359 0.8659 0.43478
## Neg Pred Value 0.9028 0.6208 0.97277
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1411 0.6534 0.01022
## Detection Prevalence 0.2219 0.7546 0.02352
## Balanced Accuracy 0.7739 0.7381 0.63199
kappa_rf1 = rf1_cm$overall[2]
kappa_rf1
## Kappa
## 0.4963819
low_sensitivity = rf1_cm$byClass[3,1]
intermediate_sensitivity = rf1_cm$byClass[2,1]
high_sensitivity = rf1_cm$byClass[1,1]
bal_acc_rf1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_rf1
## [1] 0.6013545
As stated earlier in the Data Exploratory Analysis, we will use the neuralnet package. To do so, we will use the oversampled and normalized training set that contains dummies for the outcome variable.
In order to choose the best number of nodes, we will use a loop (2 to 10 nodes) and look at the performance of the model on the validation set (based on the kappa value and the balanced accuracy). The neural net's stopping threshold is set to 0.5 throughout.
# --- Neural net: tune the number of hidden nodes (2..10) on the validation set ---
# One row per candidate node count, recording kappa and balanced accuracy.
nn_nodes_1 <- data.frame("number_nodes" = seq(from= 2, to = 10), "kappa_value" = NA, "balanced_accuracy" = NA)
for (i in 2:10){
set.seed(1)
# Three output units (one per class dummy); stopping threshold fixed at 0.5.
nn <- neuralnet(high_quality + intermediate_quality + low_quality ~ ., data = os_train_norm_dummy_df[, -11], hidden = i, linear.output = FALSE, threshold = 0.5)
# Predicted class scores on the validation predictors (columns 11-14 dropped —
# presumably the outcome and its dummies; TODO confirm).
valid_pred_nn <- data.frame(predict(nn, valid_norm_dummy_df[, -c(11, 12, 13, 14)]))
names(valid_pred_nn)[1:3] <- c("High", "Intermediate", "Low")
valid_pred_nn$Prediction <- NA
# Hard prediction = class with the largest score in each row.
# (seq_len(nrow(...)) replaces the original 1:length(...) anti-pattern;
# identical here since the frame is non-empty, but safe for zero rows.)
for(j in seq_len(nrow(valid_pred_nn))) {
valid_pred_nn[j, 4] <- names(which.max(valid_pred_nn[j, 1:3]))
}
nn.cm <- confusionMatrix(as.factor(valid_pred_nn$Prediction),valid_norm_df$binned_quality)
nn_nodes_1[i-1, 2] <- nn.cm[["overall"]][["Kappa"]]
# Balanced accuracy = mean of the three per-class sensitivities.
nn_nodes_1[i-1, 3] <- (nn.cm$byClass[3,1] + nn.cm$byClass[2,1] + nn.cm$byClass[1,1]) / 3
}
nn_nodes_1
## number_nodes kappa_value balanced_accuracy
## 1 2 0.26520018 0.6130463
## 2 3 0.27326774 0.6208693
## 3 4 0.27504687 0.6223886
## 4 5 0.04396906 0.3871228
## 5 6 0.03760601 0.3682718
## 6 7 0.05115352 0.3827653
## 7 8 0.29838043 0.6199154
## 8 9 0.05168019 0.3990183
## 9 10 0.05484551 0.4012064
We retain the Neural Net with threshold = 0.5 and 4 nodes (the best balanced accuracy above; its kappa is close to the highest value):
# --- Final neural net (4 hidden nodes, threshold 0.5), evaluated on the test set ---
set.seed(1)
# NOTE(review): the tuning loop above trained on os_train_norm_dummy_df, but the
# final model is trained on train_norm_dummy_df (no oversampling) — confirm
# which training set was intended.
nn1 <- neuralnet(high_quality + intermediate_quality + low_quality ~ ., data = train_norm_dummy_df[, -11], hidden = 4, linear.output = FALSE, threshold = 0.5)
# Class scores on the test predictors (columns 11-14 dropped, as in the loop).
test_pred_nn1 <- data.frame(predict(nn1, test_norm_dummy_df[, -c(11, 12, 13, 14)]))
names(test_pred_nn1)[1:3] <- c("High", "Intermediate", "Low")
test_pred_nn1$Prediction <- NA
# Hard prediction = class with the largest score per row.
# (seq_len(nrow(...)) replaces the original 1:length(...) anti-pattern.)
for(j in seq_len(nrow(test_pred_nn1))) {
test_pred_nn1[j, 4] <- names(which.max(test_pred_nn1[j, 1:3]))
}
nn1_cm <- confusionMatrix(as.factor(test_pred_nn1$Prediction), test_norm_df$binned_quality)
nn1_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 70 44 2
## Intermediate 142 682 30
## Low 0 4 4
##
## Overall Statistics
##
## Accuracy : 0.773
## 95% CI : (0.7454, 0.7989)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 0.02935
##
## Kappa : 0.2955
##
## Mcnemar's Test P-Value : 7.533e-16
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.33019 0.9342 0.11111
## Specificity 0.93995 0.3065 0.99575
## Pos Pred Value 0.60345 0.7986 0.50000
## Neg Pred Value 0.83527 0.6129 0.96701
## Prevalence 0.21677 0.7464 0.03681
## Detection Rate 0.07157 0.6973 0.00409
## Detection Prevalence 0.11861 0.8732 0.00818
## Balanced Accuracy 0.63507 0.6203 0.55343
# overall[2] is the Kappa statistic.
kappa_nn1 = nn1_cm$overall[2]
kappa_nn1
## Kappa
## 0.2954988
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = Sensitivity.
low_sensitivity = nn1_cm$byClass[3,1]
intermediate_sensitivity = nn1_cm$byClass[2,1]
high_sensitivity = nn1_cm$byClass[1,1]
# Balanced accuracy = mean per-class sensitivity.
bal_acc_nn1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_nn1
## [1] 0.4585155
# Collect the kappa and balanced accuracy of every individual model into a
# single comparison table (the kappa_* / bal_acc_* values are computed above).
results_df <- data.frame(
  "method" = c("knn", "logistic regression", "naive Bayes", "classification tree", "boosted tree", "bagged tree", "random forest", "neural net"),
  "kappa" = c(kappa_best_knn, kappa_logist, kappa_bayes, kappa_pruned_tree1, kappa_boost1, kappa_bag1, kappa_rf1, kappa_nn1),
  "balanced_accuracy" = c(bal_acc_best_knn, bal_acc_logist, bal_acc_bayes, bal_acc_pruned_tree1, bal_acc_boost1, bal_acc_bag1, bal_acc_rf1, bal_acc_nn1)
)
results_df
## method kappa balanced_accuracy
## 1 knn 0.3748935 0.6253511
## 2 logistic regression 0.2591899 0.6549218
## 3 naive Bayes 0.3035937 0.6299321
## 4 classification tree 0.3479016 0.5654404
## 5 boosted tree 0.3510758 0.6133440
## 6 bagged tree 0.3283620 0.5958851
## 7 random forest 0.4963819 0.6013545
## 8 neural net 0.2954988 0.4585155
# Bar chart of kappa per method, ordered by kappa
# (geom_col() is equivalent to geom_bar(stat = "identity")).
all_kappa_plot <- ggplot(results_df, aes(x = reorder(method, kappa), y = kappa)) +
  geom_col() +
  labs(title = "Kappa for each method", x = "Method", y = "Kappa") +
  theme(axis.text.x = element_text(angle=90))
# Same chart for balanced accuracy.
all_bal_acc_plot <- ggplot(results_df, aes(x = reorder(method, balanced_accuracy), y = balanced_accuracy)) +
  geom_col() +
  labs(title = "Balanced accuracy for each method", x = "Method", y = "Balanced accuracy") +
  theme(axis.text.x = element_text(angle=90))
# Show both charts side by side.
results_plot1 <- plot_grid(all_kappa_plot, all_bal_acc_plot)
results_plot1
# Per-model probability of class "Low" on the test set, gathered from the
# eight fitted models (all prediction objects are created earlier).
ensemble_prob_low_df = data.frame("knn_prob_low" = attr(best_knn_pred, "prob")[,3],
"log_prob_low" = os_log_prob[,3],
"nb_prob_low" = os_nb_prob[,3],
"tree_prob_low" = pruned_tree1_prob_test[,3],
"boosting_prob_low" = boost1_prob_test$prob[,3],
"bagging_prob_low" = bag1_prob_test$prob[,3],
"rf_prob_low" = rf1_prob_test[,3],
"nnet_prob_low" = test_pred_nn1[,3])
# Ensemble probability = row mean over columns 1, 2, 5, 7 only
# (knn, logistic, boosting, random forest) — presumably a deliberate
# subset of models; confirm the selection is intentional.
ensemble_prob_low_df$mean_prob_low = apply(ensemble_prob_low_df[,c(1,2,5,7)], 1, mean)
head(ensemble_prob_low_df)
## knn_prob_low log_prob_low nb_prob_low tree_prob_low boosting_prob_low
## 2 0 0.77309686 0.13981175 0.000000 0.29894777
## 8 0 0.21146901 0.00792725 0.000000 0.13121962
## 10 0 0.22419847 0.02801932 0.000000 0.04075472
## 16 0 0.04869837 0.00276073 0.000000 0.00000000
## 21 0 0.42845514 0.14712599 0.000000 0.05555899
## 24 1 0.90074622 0.81086084 0.952381 0.64561551
## bagging_prob_low rf_prob_low nnet_prob_low mean_prob_low
## 2 0.55 0.100 0.088932689 0.29301116
## 8 0.00 0.014 0.004140789 0.08917216
## 10 0.00 0.056 0.020970957 0.08023830
## 16 0.00 0.000 0.011903364 0.01217459
## 21 0.00 0.002 0.057560142 0.12150353
## 24 0.44 0.596 0.579583604 0.78559043
# Per-model probability of class "Intermediate" on the test set
# (same eight models as for the "Low" frame above).
ensemble_prob_inter_df = data.frame("knn_prob_inter" = attr(best_knn_pred, "prob")[,2],
"log_prob_inter" = os_log_prob[,2],
"nb_prob_inter" = os_nb_prob[,2],
"tree_prob_inter" = pruned_tree1_prob_test[,2],
"boosting_prob_inter" = boost1_prob_test$prob[,2],
"bagging_prob_inter" = bag1_prob_test$prob[,2],
"rf_prob_inter" = rf1_prob_test[,2],
"nnet_prob_inter" = test_pred_nn1[,2])
# Row mean over the same model subset (columns 1, 2, 5, 7 =
# knn, logistic, boosting, random forest).
ensemble_prob_inter_df$mean_prob_inter = apply(ensemble_prob_inter_df[,c(1,2,5,7)], 1, mean)
head(ensemble_prob_inter_df)
## knn_prob_inter log_prob_inter nb_prob_inter tree_prob_inter
## 2 1 0.20335112 0.68476606 1.00000000
## 8 1 0.66187915 0.92371260 1.00000000
## 10 1 0.40644241 0.55929249 1.00000000
## 16 0 0.25746639 0.11256673 0.86666667
## 21 0 0.29900199 0.05309515 0.00000000
## 24 0 0.09574893 0.18696631 0.04761905
## boosting_prob_inter bagging_prob_inter rf_prob_inter nnet_prob_inter
## 2 0.6878942 0.45 0.878 0.9445283
## 8 0.6524447 1.00 0.808 0.8849977
## 10 0.4689815 0.00 0.594 0.7708970
## 16 0.3737503 0.00 0.430 0.4913243
## 21 0.2888439 0.00 0.072 0.5156588
## 24 0.3470573 0.56 0.396 0.6006858
## mean_prob_inter
## 2 0.6923113
## 8 0.7805810
## 10 0.6173560
## 16 0.2653042
## 21 0.1649615
## 24 0.2097016
# Per-model probability of class "High" on the test set
# (same eight models as for the "Low" and "Intermediate" frames).
ensemble_prob_high_df = data.frame("knn_prob_high" = attr(best_knn_pred, "prob")[,1],
"log_prob_high" = os_log_prob[,1],
"nb_prob_high" = os_nb_prob[,1],
"tree_prob_high" = pruned_tree1_prob_test[,1],
"boosting_prob_high" = boost1_prob_test$prob[,1],
"bagging_prob_high" = bag1_prob_test$prob[,1],
"rf_prob_high" = rf1_prob_test[,1],
"nnet_prob_high" = test_pred_nn1[,1])
# Row mean over the same model subset (columns 1, 2, 5, 7 =
# knn, logistic, boosting, random forest).
ensemble_prob_high_df$mean_prob_high = apply(ensemble_prob_high_df[,c(1,2,5,7)], 1, mean)
head(ensemble_prob_high_df)
## knn_prob_high log_prob_high nb_prob_high tree_prob_high boosting_prob_high
## 2 0 0.023552014 0.175422185 0.0000000 0.013158039
## 8 0 0.126651841 0.068360150 0.0000000 0.216335678
## 10 0 0.369359126 0.412688190 0.0000000 0.490263760
## 16 1 0.693835240 0.884672538 0.1333333 0.626249668
## 21 1 0.272542870 0.799778864 1.0000000 0.655597113
## 24 0 0.003504844 0.002172852 0.0000000 0.007327199
## bagging_prob_high rf_prob_high nnet_prob_high mean_prob_high
## 2 0 0.022 0.02481099 0.014677513
## 8 0 0.178 0.10927765 0.130246880
## 10 1 0.350 0.17698168 0.302405721
## 16 1 0.570 0.49258641 0.722521227
## 21 1 0.926 0.39455833 0.713534996
## 24 0 0.008 0.05135879 0.004708011
# Combine the three ensemble mean-probability columns into one frame.
ensemble_prob_df = data.frame("Low" = ensemble_prob_low_df$mean_prob_low,
"Intermediate" = ensemble_prob_inter_df$mean_prob_inter,
"High" = ensemble_prob_high_df$mean_prob_high)
# Mean-probability ensemble prediction = class with the largest mean
# probability in each row. which.max returns the FIRST maximum, so exact
# ties resolve in column order (Low, then Intermediate, then High).
ensemble_prob_df$mean_prob_pred = as.factor(colnames(ensemble_prob_df)[apply(ensemble_prob_df,1,which.max)])
head(ensemble_prob_df)
## Low Intermediate High mean_prob_pred
## 1 0.29301116 0.6923113 0.014677513 Intermediate
## 2 0.08917216 0.7805810 0.130246880 Intermediate
## 3 0.08023830 0.6173560 0.302405721 Intermediate
## 4 0.01217459 0.2653042 0.722521227 High
## 5 0.12150353 0.1649615 0.713534996 High
## 6 0.78559043 0.2097016 0.004708011 Low
# Side-by-side class predictions from every model, plus the actual labels.
ensembles_pred_df = data.frame("actual_value" = test_df$binned_quality,
"knn_pred" = best_knn_pred,
"log_pred" = os_log_pred$pred,
"nb_pred" = os_nb_pred,
"tree_pred" = pruned_tree1_pred_test,
"boosting_pred" = as.factor(boost1_pred_test$class),
"bagging_pred" = as.factor(bag1_pred_test$class),
"rf_pred" = rf1_pred_test,
"nnet_pred" = as.factor(test_pred_nn1$Prediction))
# Majority vote over five models (columns 2, 3, 4, 6, 8 = knn, logistic,
# naive Bayes, boosting, random forest — presumably deliberate; confirm).
# which.max on the vote table returns the first maximum, so ties resolve
# in the table's level order.
ensembles_pred_df$majority_vote_pred = as.factor(apply(ensembles_pred_df[,c(2,3,4,6,8)], 1, function(x) names(which.max(table(x)))))
# Attach the mean-probability ensemble prediction computed above.
ensembles_pred_df$mean_prob_pred = ensemble_prob_df$mean_prob_pred
head(ensembles_pred_df)
## actual_value knn_pred log_pred nb_pred tree_pred
## 2 Intermediate Intermediate Low Intermediate Intermediate
## 8 Intermediate Intermediate Intermediate Intermediate Intermediate
## 10 Intermediate Intermediate Intermediate Intermediate Intermediate
## 16 High High High High Intermediate
## 21 High High Low High High
## 24 Intermediate Low Low Low Low
## boosting_pred bagging_pred rf_pred nnet_pred majority_vote_pred
## 2 Intermediate Low Intermediate Intermediate Intermediate
## 8 Intermediate Intermediate Intermediate Intermediate Intermediate
## 10 High High Intermediate Intermediate Intermediate
## 16 High High High High High
## 21 High High High Intermediate High
## 24 Low Intermediate Low Intermediate Low
## mean_prob_pred
## 2 Intermediate
## 8 Intermediate
## 10 Intermediate
## 16 High
## 21 High
## 24 Low
# --- Majority-vote ensemble: test-set evaluation ---
# test_df[,11] — presumably binned_quality, matching test_df$binned_quality
# used in the earlier confusion matrices; confirm the column index.
majority_vote_cm <- confusionMatrix(ensembles_pred_df$majority_vote_pred, test_df[,11])
majority_vote_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 170 166 2
## Intermediate 39 520 16
## Low 3 44 18
##
## Overall Statistics
##
## Accuracy : 0.7239
## 95% CI : (0.6948, 0.7517)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 0.9499
##
## Kappa : 0.4294
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.8019 0.7123 0.50000
## Specificity 0.7807 0.7782 0.95011
## Pos Pred Value 0.5030 0.9043 0.27692
## Neg Pred Value 0.9344 0.4789 0.98028
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1738 0.5317 0.01840
## Detection Prevalence 0.3456 0.5879 0.06646
## Balanced Accuracy 0.7913 0.7453 0.72505
# overall[2] is the Kappa statistic.
kappa_majority = majority_vote_cm$overall[2]
kappa_majority
## Kappa
## 0.4293531
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = Sensitivity.
low_sensitivity = majority_vote_cm$byClass[3,1]
intermediate_sensitivity = majority_vote_cm$byClass[2,1]
high_sensitivity = majority_vote_cm$byClass[1,1]
# Balanced accuracy = mean per-class sensitivity.
bal_acc_majority = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_majority
## [1] 0.6714052
# --- Mean-probability ensemble: test-set evaluation ---
# test_df[,11] — presumably binned_quality; confirm the column index.
mean_prob_cm <- confusionMatrix(ensembles_pred_df$mean_prob_pred, test_df[,11])
mean_prob_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Intermediate Low
## High 153 116 2
## Intermediate 55 561 13
## Low 4 53 21
##
## Overall Statistics
##
## Accuracy : 0.7515
## 95% CI : (0.7232, 0.7783)
## No Information Rate : 0.7464
## P-Value [Acc > NIR] : 0.3725
##
## Kappa : 0.4562
##
## Mcnemar's Test P-Value : 4.087e-10
##
## Statistics by Class:
##
## Class: High Class: Intermediate Class: Low
## Sensitivity 0.7217 0.7685 0.58333
## Specificity 0.8460 0.7258 0.93949
## Pos Pred Value 0.5646 0.8919 0.26923
## Neg Pred Value 0.9165 0.5158 0.98333
## Prevalence 0.2168 0.7464 0.03681
## Detection Rate 0.1564 0.5736 0.02147
## Detection Prevalence 0.2771 0.6431 0.07975
## Balanced Accuracy 0.7838 0.7471 0.76141
# overall[2] is the Kappa statistic.
kappa_mean_prob = mean_prob_cm$overall[2]
kappa_mean_prob
## Kappa
## 0.4562365
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = Sensitivity.
low_sensitivity = mean_prob_cm$byClass[3,1]
intermediate_sensitivity = mean_prob_cm$byClass[2,1]
high_sensitivity = mean_prob_cm$byClass[1,1]
# Balanced accuracy = mean per-class sensitivity.
bal_acc_mean_prob = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_mean_prob
## [1] 0.6911749
# Append the two ensemble methods to the comparison table built earlier.
results_df[9,] <- list("majority vote", kappa_majority, bal_acc_majority)
results_df[10,] <- list("mean probability", kappa_mean_prob, bal_acc_mean_prob)
# Re-plot kappa per method, now including the ensembles
# (geom_col() is equivalent to geom_bar(stat = "identity")).
all_kappa_plot2 <- ggplot(results_df, aes(x = reorder(method, kappa), y = kappa)) +
  geom_col() +
  labs(title = "Kappa for each method", x = "Method", y = "Kappa") +
  theme(axis.text.x = element_text(angle=90))
# Same chart for balanced accuracy.
all_bal_acc_plot2 <- ggplot(results_df, aes(x = reorder(method, balanced_accuracy), y = balanced_accuracy)) +
  geom_col() +
  labs(title = "Balanced accuracy for each method", x = "Method", y = "Balanced accuracy") +
  theme(axis.text.x = element_text(angle=90))
# Show both charts side by side.
results_plot2 <- plot_grid(all_kappa_plot2, all_bal_acc_plot2)
results_plot2
We first create the data frames associated with the confusion matrices of each method:
# Build, for one caret confusion matrix, the long-format data frame used by
# the heatmap plots: one row per (Prediction, Reference) cell, flagged
# "correct"/"error", with each cell's proportion within its Reference column.
# Prediction levels are reversed so the heatmap's y-axis reads top-down.
# NOTE: the result is left grouped by Reference, exactly as the original
# (ten copy-pasted versions of this pipeline) left it; ggplot ignores the
# grouping. Replaces the df$col references inside mutate() with bare columns.
cm_to_plot_df <- function(cm) {
  cm_df <- as.data.frame(cm$table)
  cm_df$Prediction <- factor(cm_df$Prediction, levels = rev(levels(cm_df$Prediction)))
  cm_df %>%
    mutate(pred = ifelse(Prediction == Reference, "correct", "error")) %>%
    group_by(Reference) %>%
    mutate(prop = Freq / sum(Freq))
}
# One plotting frame per model / ensemble (same names as before).
plot_knn_cm <- cm_to_plot_df(best_knn_cm)
plot_os_log_cm <- cm_to_plot_df(os_log_cm)
plot_bayes_cm <- cm_to_plot_df(bayes_cm)
plot_pruned_tree1_cm <- cm_to_plot_df(pruned_tree1_cm)
plot_boost1_cm <- cm_to_plot_df(boost1_cm)
plot_bag1_cm <- cm_to_plot_df(bag1_cm)
plot_rf1_cm <- cm_to_plot_df(rf1_cm)
plot_nn1_cm <- cm_to_plot_df(nn1_cm)
plot_majority_vote_cm <- cm_to_plot_df(majority_vote_cm)
plot_mean_prob_cm <- cm_to_plot_df(mean_prob_cm)
Let’s create the plots:
# Render one confusion-matrix heatmap: blue tiles = correct predictions,
# red = errors; tile opacity is the cell's share of its Reference column,
# with raw counts printed on top. Factors out the ggplot spec that was
# copy-pasted ten times in the original.
cm_heatmap <- function(cm_df, title) {
  ggplot(cm_df, aes(Reference, Prediction, fill = pred, alpha = prop)) +
    geom_tile() +
    theme_bw() +
    geom_text(aes(label=Freq), vjust = .5, fontface = "bold", alpha = 1) +
    scale_fill_manual(values = c(correct = "#5DADE2", error = "#EC7063")) +
    xlim((levels(cm_df$Reference))) +
    labs(x = "Reference",y = "Prediction", title = title)
}
# As in the original code, each plot object overwrites the plotting data
# frame of the same name built just above, and is printed immediately.
plot_knn_cm <- cm_heatmap(plot_knn_cm, "Confusion Matrix: kNN")
plot_knn_cm
plot_os_log_cm <- cm_heatmap(plot_os_log_cm, "Confusion Matrix: Logistic Regression")
plot_os_log_cm
plot_bayes_cm <- cm_heatmap(plot_bayes_cm, "Confusion Matrix: Naive Bayes")
plot_bayes_cm
plot_pruned_tree1_cm <- cm_heatmap(plot_pruned_tree1_cm, "Confusion Matrix: Pruned Tree")
plot_pruned_tree1_cm
plot_boost1_cm <- cm_heatmap(plot_boost1_cm, "Confusion Matrix: Boosted Tree")
plot_boost1_cm
plot_bag1_cm <- cm_heatmap(plot_bag1_cm, "Confusion Matrix: Bagged Tree")
plot_bag1_cm
plot_rf1_cm <- cm_heatmap(plot_rf1_cm, "Confusion Matrix: Random Forest")
plot_rf1_cm
plot_nn1_cm <- cm_heatmap(plot_nn1_cm, "Confusion Matrix: Neural Net")
plot_nn1_cm
plot_majority_vote_cm <- cm_heatmap(plot_majority_vote_cm, "Confusion Matrix: Majority Vote (Ensembles)")
plot_majority_vote_cm
plot_mean_prob_cm <- cm_heatmap(plot_mean_prob_cm, "Confusion Matrix: Mean Probability (Ensembles)")
plot_mean_prob_cm